{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4011807d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "45df07dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "sys.path.append(\"..\")\n",
    "from benchmark import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4d10abc",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0d64ca07",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the ground-truth files on https://big-ann-benchmarks.com/\n",
    "\n",
    "\n",
    "new_gt = {\n",
    "    'bigann-1B': \"https://comp21storage.blob.core.windows.net/publiccontainer/comp21/bigann/public_query_gt100.bin\", \n",
    "    \"ssnpp-1B\": \"https://dl.fbaipublicfiles.com/billion-scale-ann-benchmarks/FB_ssnpp_public_queries_GT.rangeres\",\n",
    "    'msturing-1B': \"https://comp21storage.blob.core.windows.net/publiccontainer/comp21/MSFT-TURING-ANNS/query_gt100.bin\",\n",
    "    \"msspacev-1B\": \"https://comp21storage.blob.core.windows.net/publiccontainer/comp21/spacev1b/public_query_gt100.bin\", \n",
    "    \"deep-1B\": \"https://storage.yandexcloud.net/yandex-research/ann-datasets/deep_new_groundtruth.public.10K.bin\", \n",
    "    \"text2image-1B\": \"https://storage.yandexcloud.net/yandex-research/ann-datasets/t2i_new_groundtruth.public.100K.bin\",\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "4dd26410",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset BigANNDataset in dimension 128, with distance euclidean, search_type knn, size: Q 10000 B 1000000000\n",
      "Dataset SSNPPDataset in dimension 256, with distance euclidean, search_type range, size: Q 100000 B 1000000000\n",
      "Dataset MSTuringANNS in dimension 100, with distance euclidean, search_type knn, size: Q 100000 B 1000000000\n",
      "Dataset MSSPACEV1B in dimension 100, with distance euclidean, search_type knn, size: Q 29316 B 1000000000\n",
      "Dataset Deep1BDataset in dimension 96, with distance euclidean, search_type knn, size: Q 10000 B 1000000000\n",
      "Dataset Text2Image1B in dimension 200, with distance ip, search_type knn, size: Q 100000 B 1000000000\n"
     ]
    }
   ],
   "source": [
    "# get official GT file \n",
    "\n",
    "\n",
    "for dsname in new_gt: \n",
    "    ds = datasets.DATASETS[dsname]()\n",
    "    print(ds)\n",
    "    \n",
    "    data = urllib.request.urlopen(new_gt[dsname]).read()\n",
    "    open(f\"/tmp/new_GT/{dsname}\", \"wb\").write(data)\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "id": "c0c2545b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_diff_1_result(Dref, Iref, Dnew, Inew, eps): \n",
    "    \"\"\" compare knn search results. Intended to normalize for: \n",
    "    - small variations of the distance measure (below eps)\n",
    "    - ordering of ties\n",
    "    \"\"\"\n",
    "    if not np.all(Dref == Dnew): \n",
    "        assert np.abs(Dref - Dnew).max() < eps\n",
    "        # attempt to do some normalization to merge nearby distances \n",
    "        Dref = np.floor(np.minimum(Dref, Dnew) / eps) * eps           \n",
    "    \n",
    "    ndiff = 0\n",
    "    cur_d = -1e10\n",
    "    s_ref = set()\n",
    "    s_new = set()\n",
    "    for j in range(len(Iref)): \n",
    "        if Dref[j] != cur_d: \n",
    "            nd = len(s_ref ^ s_new)\n",
    "            ndiff += nd\n",
    "            if nd > 0: \n",
    "                pass\n",
    "                # print(i, cur_d, s_ref, s_new)\n",
    "            s_ref = set()\n",
    "            s_new = set()\n",
    "            cur_d = Dref[j]\n",
    "        s_ref.add(Iref[j])\n",
    "        s_new.add(Inew[j])             \n",
    "    return ndiff\n",
    "\n",
    "def compare_knn_res(Dref, Iref, Dnew, Inew): \n",
    "\n",
    "    ndiff = 0\n",
    "    eps = Dref.max() * 1e-5\n",
    "    for i in range(len(Iref)):\n",
    "        \n",
    "        if np.all(Iref[i] == Inew[i]): \n",
    "            continue\n",
    "     \n",
    "        ndiff += count_diff_1_result(Dref[i], Iref[i], Dnew[i], Inew[i], eps)\n",
    "    \n",
    "\n",
    "    return ndiff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "af4affa2",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bigann-1B Dataset BigANNDataset in dimension 128, with distance euclidean, search_type knn, size: Q 10000 B 1000000000\n",
      "raw_diff=0.9899 % diff=0.0 %\n",
      "(10000, 100) (10000, 100)\n",
      "ssnpp-1B Dataset SSNPPDataset in dimension 256, with distance euclidean, search_type range, size: Q 100000 B 1000000000\n",
      "(7706752,) (7706752,)\n",
      "msturing-1B Dataset MSTuringANNS in dimension 100, with distance euclidean, search_type knn, size: Q 100000 B 1000000000\n",
      "raw_diff=0.0195 % diff=0.00024 %\n",
      "(100000, 100) (100000, 100)\n",
      "msspacev-1B Dataset MSSPACEV1B in dimension 100, with distance euclidean, search_type knn, size: Q 29316 B 1000000000\n",
      "raw_diff=24.181163869559285 % diff=0.0 %\n",
      "(29316, 100) (29316, 100)\n",
      "deep-1B Dataset Deep1BDataset in dimension 96, with distance euclidean, search_type knn, size: Q 10000 B 1000000000\n",
      "raw_diff=0.1864 % diff=0.0002 %\n",
      "(10000, 100) (10000, 100)\n",
      "text2image-1B Dataset Text2Image1B in dimension 200, with distance ip, search_type knn, size: Q 100000 B 1000000000\n",
      "raw_diff=0.04773 % diff=0.0 %\n",
      "(100000, 100) (100000, 100)\n"
     ]
    }
   ],
   "source": [
    "# compare with what I computed \n",
    "new_basedir = \"/checkpoint/matthijs/billion-scale-ann-benchmarks/GT_1B/\"\n",
    "\n",
    "for dsname in new_gt: \n",
    "    ds = datasets.DATASETS[dsname]()\n",
    "    print(dsname, ds)\n",
    "    if ds.search_type() == \"knn\": \n",
    "        Iref, Dref = datasets.knn_result_read(f\"/tmp/new_GT/{dsname}\")\n",
    "        Inew, Dnew = datasets.knn_result_read(f\"{new_basedir}/{dsname}\")\n",
    "        raw_ndiff = (Iref != Inew).sum()\n",
    "        ndiff = compare_knn_res(Dref, Iref, Dnew, Inew)        \n",
    "        print(f\"raw_diff={100 * raw_ndiff/ Iref.size} % diff={100 * ndiff/ Iref.size} %\")\n",
    "        \n",
    "    else: \n",
    "        nres_ref, Iref, Dref = datasets.range_result_read(f\"/tmp/new_GT/{dsname}\")\n",
    "        nres_new, Inew, Dnew = datasets.range_result_read(f\"{new_basedir}/{dsname}\")\n",
    "        # does not make much sense to verify, they are computed simultaneously\n",
    "        \n",
    "    print(Iref.shape, Inew.shape)\n",
    "    \n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8b100230",
   "metadata": {},
   "source": [
    "# Check subsets -- range"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a069dc1d",
   "metadata": {},
   "source": [
    "Make sure the 10M and 100M results are a subset of 1B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "fd4bebff",
   "metadata": {},
   "outputs": [],
   "source": [
    "dsname = \"ssnpp-1B\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "1ba55156",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_basedir = \"/checkpoint/matthijs/billion-scale-ann-benchmarks/GT_1B/\"\n",
    "\n",
    "nres_ref, Iref, Dref = datasets.range_result_read(f\"/tmp/new_GT/{dsname}\")\n",
    "nres_new, Inew, Dnew = datasets.range_result_read(f\"{new_basedir}/{dsname}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "2f570eb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "for nb, ss in [(10 ** 7, \"10M\"), (10 ** 8, \"100M\")]: \n",
    "    ds_sub = dsname.replace(\"1B\", ss)\n",
    "    nres_sub, Isub, Dsub = datasets.range_result_read(f\"/checkpoint/matthijs/billion-scale-ann-benchmarks/GT_{ss}/{ds_sub}\")\n",
    "    \n",
    "    nq = len(nres_ref)\n",
    "    assert len(nres_sub) == nq\n",
    "    i0 = j0 = 0\n",
    "    for i in range(nq): \n",
    "        i1 = i0 + nres_ref[i]\n",
    "        j1 = j0 + nres_sub[i]\n",
    "\n",
    "        ref_res = Iref[i0:i1]\n",
    "        sub_res = Isub[j0:j1]\n",
    "\n",
    "        ref_res_sub = ref_res[ref_res < nb]\n",
    "        assert set(ref_res_sub) == set(sub_res)\n",
    "\n",
    "        i0 = i1\n",
    "        j0 = j1\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eed490b8",
   "metadata": {},
   "source": [
    "# Check subsets -- knn"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b4b5f7a9",
   "metadata": {},
   "source": [
    "Make sure the 10M and 100M results are a subset of 1B in knn sense "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "7d846214",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bigann-1B\n",
      "10M diff=0.0 % (verif on 10175 / 1000000 = 1/98.3)\n",
      "100M diff=0.0 % (verif on 99455 / 1000000 = 1/10.1)\n",
      "msturing-1B\n",
      "10M diff=0.0 % (verif on 99896 / 10000000 = 1/100.1)\n",
      "100M diff=0.0 % (verif on 1000758 / 10000000 = 1/10.0)\n",
      "msspacev-1B\n",
      "10M diff=0.0 % (verif on 30801 / 2931600 = 1/95.2)\n",
      "100M diff=0.0 % (verif on 293540 / 2931600 = 1/10.0)\n",
      "deep-1B\n",
      "10M diff=0.0 % (verif on 10285 / 1000000 = 1/97.2)\n",
      "100M diff=0.0 % (verif on 100663 / 1000000 = 1/9.9)\n",
      "text2image-1B\n",
      "10M diff=0.0 % (verif on 99944 / 10000000 = 1/100.1)\n",
      "100M diff=0.0 % (verif on 999862 / 10000000 = 1/10.0)\n"
     ]
    }
   ],
   "source": [
    "basedir = \"/checkpoint/matthijs/billion-scale-ann-benchmarks/GT\"\n",
    "\n",
    "for dsname in new_gt: \n",
    "    if dsname == \"ssnpp-1B\": \n",
    "        continue\n",
    "    print(dsname)\n",
    "    I1B, D1B = datasets.knn_result_read(f\"{basedir}_1B/{dsname}\")\n",
    "    nq = len(I1B)\n",
    "    ndiff = 0\n",
    "    eps = D1B.max() * 1e-5\n",
    "    \n",
    "    for nb, ss in [(10 ** 7, \"10M\"), (10 ** 8, \"100M\")]: \n",
    "        ds_sub = dsname.replace(\"1B\", ss)\n",
    "        Iss, Dss = datasets.knn_result_read(f\"{basedir}_{ss}/{ds_sub}\")\n",
    "        ndiff = 0\n",
    "        ltot = 0\n",
    "        \n",
    "        for i in range(nq): \n",
    "            ref_I = I1B[i][I1B[i] < nb]\n",
    "            ref_D = D1B[i][I1B[i] < nb]\n",
    "            \n",
    "            l = len(ref_I)\n",
    "            ndiff += count_diff_1_result(ref_D, ref_I, Dss[i, :l], Iss[i, :l], eps)\n",
    "            ltot += l\n",
    "            \n",
    "        print(f\"{ss} diff={100 * ndiff / ltot} % (verif on {ltot} / {I1B.size} = 1/{I1B.size/ltot:.1f})\")\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b704d902",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
